# Match estimated topics from DTM with topics from the Policy Agenda
# (PA) project. This is done through a comparison of the top N words
# (N = 10, 15, ..., 50) in a DTM topic with the word list of each PA
# topic description, using the Jaccard index to find the word lists
# that overlap the most.

def _readWordSets(path):
    """Return a list with one set of whitespace-separated words per line of path."""
    with open(path, "r") as infile:
        # str.split() without arguments already discards surrounding
        # whitespace, so no per-word strip() is needed.
        return [set(line.split()) for line in infile]


def _normalizedJaccard(dtmWords, paWords):
    """Return the Jaccard index of the two word sets, scaled by |PA| / |DTM|.

    The plain Jaccard index |A & B| / |A | B| is divided by the size ratio
    len(dtmWords) / len(paWords). If either set is empty the score is
    defined as 0.0 instead of raising ZeroDivisionError.
    """
    if not dtmWords or not paWords:
        return 0.0
    jaccard = len(dtmWords.intersection(paWords)) / float(len(dtmWords.union(paWords)))
    # normalize by size of PA topic set
    return jaccard / float(len(dtmWords) / float(len(paWords)))


# The policy agenda word lists and labels do not depend on the number of
# DTM top words, so they are read (and filtered) once, up front.
topicsPA = _readWordSets("./data/policy_agenda_word_lists.txt")

# read policy agenda topic labels (line k labels word list k)
with open("./data/policy_agenda_topic_labels.txt", "r") as f3:
    topicsPALabels = [line.strip() for line in f3]

# delete government operations topic
# (list.index raises ValueError if the label is not present)
delIndex = topicsPALabels.index("government operations")
del topicsPA[delIndex]
del topicsPALabels[delIndex]

for topWords in (10, 15, 20, 25, 30, 35, 40, 45, 50):
    # read DTM vocabulary list: one topic per line, line number = topic number
    fnameIn = "%s%s%s" % ("./data/DTM_word_list_22_topics_top_", topWords, "_words.txt")
    topicsDTM = _readWordSets(fnameIn)

    # open outfile and write header
    fnameOut = "%s%s%s" % ("./data/matches_DTM_model_22_topics_top_", topWords, "_words.csv")
    with open(fnameOut, "w") as outfile:
        header = "%s,%s,%s,%s,%s\n" % ("dtm.topic.number", "pa.label.jaccard", "jaccard.value", "dtm.topwords", "pa.topwords.jaccard")
        outfile.write(header)

        # find the best-matching PA topic for every DTM topic
        for i, dtmWords in enumerate(topicsDTM):
            maxPaJaccard = -1
            maxValJaccard = -1
            for j, paWords in enumerate(topicsPA):
                jaccard = _normalizedJaccard(dtmWords, paWords)
                # strict '>' keeps the first PA topic on ties
                if jaccard > maxValJaccard:
                    maxPaJaccard = j
                    maxValJaccard = jaccard

            # Write the winning score (maxValJaccard). Writing `jaccard`
            # here would report the score of whichever PA topic happened
            # to be compared last, not the one named in the label column.
            out = "%s,%s,%s,%s,%s\n" % (str(i), str(topicsPALabels[maxPaJaccard]), str(round(maxValJaccard, 2)), " ".join(dtmWords), " ".join(topicsPA[maxPaJaccard]))
            outfile.write(out)

